# Title: 7-create-panel-data.R
# Description: Replication file "Witch Trials."
# Authors: Peter T. Leeson and Jacob W. Russ

# Load R packages -------------------------------------------------------------

# library(readr)
# library(dplyr)
# library(tidyr)
# library(readxl)
# library(magrittr)

# Import datasets -------------------------------------------------------------

# Don't run: use CCSM4.0 climate data and our own script (later)
#source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "1-create-weather.R"))
# weather <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "weather.csv"), header=TRUE, stringsAsFactors=FALSE)

#source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "2-create-population.R"))
# Read the cleaned inputs used to build the panel. The commented-out source()
# calls name the numbered scripts that presumably regenerate each CSV; they are
# not re-run here.
population <- read.csv(
  here::here("Witchtrial_diffusion", "Leeson-Russ_EJ_2019", "data", "clean", "population.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)
#source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "3-create-urbanization.R"))
urban <- read.csv(
  here::here("Witchtrial_diffusion", "Leeson-Russ_EJ_2019", "data", "clean", "urbanization.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)
#source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "4-create-real-wage-taxes.R"))
real_wages <- read.csv(
  here::here("Witchtrial_diffusion", "Leeson-Russ_EJ_2019", "data", "clean", "real_wages.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)
# The manually modified tax file is read in place of the original taxes.csv:
# taxes <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "taxes.csv"), header=TRUE, stringsAsFactors=FALSE)
taxes <- read.csv(
  here::here("Witchtrial_diffusion", "Leeson-Russ_EJ_2019", "data", "clean", "taxes-manually-modified.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)

# Don't run: use CCSM4.0 climate data and our own script (later)
#source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "5-create-temperature.R"))
# temperature <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "temperature.csv"), header=TRUE, stringsAsFactors=FALSE)

# Recode United Kingdom countries to GADM 1 Regions ---------------------------

# Rows whose GADM country (gadm.adm0) is "United Kingdom" use their GADM level-1
# region (gadm.adm1) as `country`; all other rows keep gadm.adm0.
# NOTE: `trials` and `battles` are not created in this script and must already
# exist in the session.
trials <- mutate(
  trials,
  country = if_else(gadm.adm0 %in% "United Kingdom", gadm.adm1, gadm.adm0)
)

battles <- mutate(
  battles,
  country = if_else(gadm.adm0 %in% "United Kingdom", gadm.adm1, gadm.adm0)
)

# Expand the trials data to make a balanced panel -----------------------------

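# The standalone `df` below appears to be a leftover: the same rows are created
# inline inside bind_rows() further down, so it stays commented out.
# df <- data_frame(decade = c(1800, 1810, 1840), tried = 0, country = "Austria")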

# Total number of people tried per country and decade, zero-filled so that
# every country has a row for every decade (balanced panel).
trials_by_country_decade <- trials %>%
  # Add decades that do not appear in the data before using complete(); they
  # enter as zero-trial rows attached to Austria. tibble::tibble() replaces the
  # deprecated data_frame().
  bind_rows(tibble::tibble(decade  = c(1800, 1810, 1840),
                           tried   = 0,
                           country = "Austria")) %>%
  # Fill in "missing" country-decade combinations with zero trials
  complete(country, decade, fill = list(tried = 0)) %>%
  group_by(country, decade) %>%
  summarise(trials = sum(tried, na.rm = TRUE)) %>%
  # Drop the grouping that summarise() leaves behind to prevent unanticipated
  # outcomes in later steps
  ungroup() %>%
  # Add century via decade (integer division by 100, then times 100)
  mutate(century = (decade %/% 100) * 100) %>%
  dplyr::select(country, decade, century, trials)

# Number of battle records per country and decade. Only country-decades that
# have at least one battle appear here. The result is ungrouped to prevent
# unanticipated outcomes later.
battles_by_country_decade <- ungroup(
  summarise(
    group_by(battles, country, decade),
    battles = n()
  )
)

# Merge datasets to create panel ----------------------------------------------

# This creates a country-decade panel that LR use for some models. We are not interested in this, at least at this time.

# Country-decade panel: trial totals joined with battle counts and population,
# plus rates scaled by 1,000,000, log transforms, and battle leads.
combined <- trials_by_country_decade %>%
  left_join(y = battles_by_country_decade, by = c("country", "decade")) %>%
  # Country-decades without a matching battles row get a battle count of zero
  replace_na(list(battles = 0)) %>%
  left_join(y = population, by = c("country", "decade")) %>%
  mutate(trials.mil        = round((trials / population) * 1000000, digits = 3),
         battles.mil       = round((battles / population) * 1000000, digits = 3),
         # Zero counts are set to NA in the plain logs (log(0) is -Inf)
         ln.trials         = if_else(trials %in% 0, NA_real_, log(trials)),
         ln.trials.mil     = if_else(trials.mil %in% 0, NA_real_, log(trials.mil)),
         ln1p.trials       = log1p(trials),
         ln1p.trials.mil   = log1p(trials.mil)) %>%
  # Add three "future" battles columns for the placebo test. For leads or lags
  # to work correctly, we need to sort the data frame and use only one grouping
  # variable. In this case use country name.
  arrange(country, decade) %>%
  group_by(country) %>%
  mutate(battles.tp1        = lead(battles, 1),
         battles.tp2        = lead(battles, 2),
         battles.tp3        = lead(battles, 3),
         battles.mil.tp1    = lead(battles.mil, 1),
         battles.mil.tp2    = lead(battles.mil, 2),
         battles.mil.tp3    = lead(battles.mil, 3)) %>%
  # The grouping was only needed for the leads; drop it so later joins and the
  # export work on a plain (ungrouped) tibble
  ungroup()

# Add external sources

# Attach the external series to the panel. Urbanization is matched by century;
# real wages and taxes are matched by decade. The `revenues` column of `taxes`
# is dropped before joining.
# The weather and temperature joins are disabled (their data are not loaded):
#   left_join(y = weather,   by = c("country", "decade"))
#   left_join(y = temperature, by = c("country", "decade"))
combined <- combined %>%
  left_join(urban, by = c("country", "century")) %>%
  left_join(real_wages, by = c("country", "decade")) %>%
  left_join(taxes %>% dplyr::select(-revenues), by = c("country", "decade"))

# Export series to CSV --------------------------------------------

# write_csv(combined, "data/clean/panel_dataset.csv")
# Write the finished panel to the project's out_data folder.
readr::write_csv(
  combined,
  here::here("Witchtrial_diffusion", "out_data", "panel_dataset.csv")
)

  